PART 3: Pan-Cancer signature analysis

Shixiang Wang, Ziyu Tao, Tao Wu, Xue-Song Liu (Corresponding author)

2020-12-22

In this part, we will analyze copy number signatures across cancer types and show the landscape.

Signature number and contribution in each cancer type

Load tidy cancer type annotation data.

library(sigminer)
library(tidyverse)

# Cancer type annotation table; later merged with the activity data by "sample"
pcawg_types <- readRDS(file = "../data/pcawg_type_info.rds")

To better describe the copy number signature landscape, here we use the refitted activity data obtained from the bootstrap procedure (see section “Reliable signature activity attribution” in PART 1).

# Activity object; its `abs_activity`, `rel_activity` and `similarity`
# elements are used below
pcawg_activity <- readRDS(file = "../data/pcawg_cn_sigs_CN176_activity.rds")

Combine the cancer type annotation and activity data, and only keep samples with good reconstruction (cosine similarity >= 0.75).

# Logical mask of samples whose reconstruction similarity is at least 0.75
# NOTE(review): assumes `similarity` is ordered like the rows of the two
# activity tables -- confirm
keep_samps <- pcawg_activity$similarity >= 0.75

# Attach the cancer type annotation to the absolute and relative activities
# of the retained samples
df_abs <- merge(
  x = pcawg_activity$abs_activity[keep_samps],
  y = pcawg_types,
  by = "sample"
)
df_rel <- merge(
  x = pcawg_activity$rel_activity[keep_samps],
  y = pcawg_types,
  by = "sample"
)

Signature activity in each cancer type

Here we draw the distribution of a single signature (Sig1 as an example) across cancer types.

# Absolute activity of Sig1, grouped by cancer type
sigminer::show_group_distribution(
  df_abs,
  dvar = "Sig1",
  gvar = "cancer_type",
  point_size = 0.3,
  g_angle = 90,
  order_by_fun = FALSE
)

We have many signatures here, so we output them to PDF files.

# Write one PDF per signature and per activity type (absolute / relative)
# showing the distribution of that signature's activity across cancer types.
# `recursive = TRUE` also creates "../output" when it does not exist yet.
dir.create("../output/cancer-type-dist", showWarnings = FALSE, recursive = TRUE)
signames <- paste0("Sig", 1:11)

# The list names become the file name prefix:
# "Absolute_activity_<sig>.pdf" and "Relative_activity_<sig>.pdf"
activity_tables <- list(Absolute = df_abs, Relative = df_rel)

for (i in signames) {
  for (activity_type in names(activity_tables)) {
    pxx <- show_group_distribution(
      activity_tables[[activity_type]],
      gvar = "cancer_type",
      dvar = i, order_by_fun = FALSE,
      ylab = i,
      g_angle = 90, point_size = 0.3
    )
    ggplot2::ggsave(
      file.path(
        "../output/cancer-type-dist",
        paste0(activity_type, "_activity_", i, ".pdf")
      ),
      plot = pxx, width = 12, height = 6
    )
  }
}
# Drop the loop temporaries
rm(pxx, activity_tables, activity_type)

Signature landscape

We define a signature as detectable in a sample if it contributes >5% of the exposure in that sample.

# Reshape the relative activity table to long format (one row per
# sample/signature pair) and flag a signature as detectable in a sample when
# its relative activity is above 0.05 (i.e. >5%).
# `detectable` is derived directly from `expo`, so a single pivot is enough;
# no second pivot and self-join are needed.
df <- df_rel %>%
  tidyr::pivot_longer(
    cols = dplyr::starts_with("Sig"),
    names_to = "sig", values_to = "expo"
  ) %>%
  # 1L = detectable, 0L = not detectable (NA activity stays NA)
  dplyr::mutate(detectable = as.integer(expo > 0.05), .before = expo)

# Summarise per cancer type and signature:
#   freq  - number of samples in which the signature is detectable
#   expo  - median relative activity among those samples
#           (NA when the signature is detectable in no sample)
#   n     - number of rows (samples) in the group
#   label - cancer type annotated with its group size
df_type <- dplyr::summarise(
  dplyr::group_by(df, cancer_type, sig),
  freq = sum(detectable),
  expo = median(expo[detectable == 1]),
  n = dplyr::n(),
  label = paste0(unique(cancer_type), " (n=", n, ")"),
  .groups = "drop"
)

# Lookup between cancer type and its "(n=...)" label
mps <- dplyr::distinct(df_type, cancer_type, label)
mpss <- stats::setNames(mps$label, mps$cancer_type)
summary(df_type$freq)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    6.00   15.00   30.42   36.00  247.00 

Show copy number signature landscape.

library(cowplot)

# Bubble plot of the signature landscape: one point per cancer type and
# signature, sized by the number of tumors with the signature (`freq`) and
# coloured by the median activity among those tumors (`expo`).
p <- ggplot(
  data = df_type,
  mapping = aes(
    x = cancer_type,
    y = factor(sig, levels = paste0("Sig", 1:11))
  )
) +
  geom_point(aes(color = expo, size = freq)) +
  labs(
    x = NULL,
    y = "Copy number signatures",
    size = "Tumors with\nthe signature",
    color = "Median activity\ndue to signature"
  ) +
  theme_cowplot() +
  ggpubr::rotate_x_text(60) +
  # Show the "(n=...)" labels on the x axis instead of bare cancer types
  scale_x_discrete(labels = mps$label, breaks = mps$cancer_type) +
  # NOTE(review): `freq` values below 5 lie outside the size limits and are
  # presumably dropped from the plot by ggplot2 -- confirm this is intended
  scale_size_continuous(
    breaks = c(5, 20, 50, 100, 200),
    limits = c(5, 250)
  ) +
  scale_color_stepsn(
    breaks = c(0, 0.25, 0.5, 0.75, 1),
    colors = viridis::viridis(5, direction = -1)
  )
p

ggsave(
  filename = "../output/CNS_PCAWG_landscape.pdf",
  plot = p,
  width = 12, height = 6
)

Cancer type associated enrichment

Run enrichment analysis.

# Enrichment of the absolute activity of each signature (Sig1..Sig11) across
# cancer type groups, compared with the Wilcoxon test
enrich_result <- sigminer::group_enrichment(
  df_abs,
  enrich_vars = sprintf("Sig%d", 1:11),
  grp_vars = "cancer_type",
  co_method = "wilcox.test"
)

Show enrichment landscape.

# Order the signatures as Sig1..Sig11 through explicit factor levels
enrich_result$enrich_var <- factor(
  enrich_result$enrich_var,
  levels = paste0("Sig", 1:11)
)
# With `return_list = TRUE` the plots come back as a list; take the element
# for the `cancer_type` grouping and drop both axis titles
p <- show_group_enrichment(
  enrich_result,
  fill_by_p_value = TRUE,
  return_list = TRUE
)
p <- p$cancer_type + labs(x = NULL, y = NULL)
p

ggsave("../output/CNS_PCAWG_enrichment_landscape.pdf",
  plot = p,
  height = 8, width = 6
)

To better visualize the enrichment results, we use binned color regions.

# Same enrichment plot, with `cut_p_value = TRUE` so the p-value fill is
# drawn as binned colour regions
p <- show_group_enrichment(
  enrich_result,
  fill_by_p_value = TRUE,
  cut_p_value = TRUE,
  return_list = TRUE
)
# Take the plot for the `cancer_type` grouping and drop both axis titles
p <- p$cancer_type + labs(x = NULL, y = NULL)
p

ggsave("../output/CNS_PCAWG_enrichment_landscape2.pdf",
  plot = p,
  height = 8, width = 6
)

We see that the cancer type SoftTissue-Liposarc has a pretty high enrichment for Sig6. Let’s check the enrichment result.

# Enrichment rows whose first group (`grp1`) is SoftTissue-Liposarc, one row
# per signature.
# NOTE(review): the printout is cut off by `max.print` after Sig5, so the
# Sig6 row is not visible; consider also filtering on `enrich_var`.
enrich_result[grp1 == "SoftTissue-Liposarc"]
        grp_var enrich_var                grp1 grp2 grp1_size grp1_pos_measure
 1: cancer_type       Sig1 SoftTissue-Liposarc Rest        19       107.747229
 2: cancer_type       Sig2 SoftTissue-Liposarc Rest        19        45.210722
 3: cancer_type       Sig3 SoftTissue-Liposarc Rest        19        17.841799
 4: cancer_type       Sig4 SoftTissue-Liposarc Rest        19        53.524895
 5: cancer_type       Sig5 SoftTissue-Liposarc Rest        19         9.332606
    grp2_size grp2_pos_measure measure_observed measure_tested      p_value
 1:      2621        17.690767        6.0905910             NA 2.618595e-02
 2:      2621        14.791334        3.0565682             NA 5.269566e-01
 3:      2621        15.222509        1.1720669             NA 7.519819e-02
 4:      2621        13.483545        3.9696455             NA 1.051345e-03
 5:      2621        14.027543        0.6653058             NA 2.968415e-02
          type      method
 1: continuous wilcox.test
 2: continuous wilcox.test
 3: continuous wilcox.test
 4: continuous wilcox.test
 5: continuous wilcox.test
 [ reached getOption("max.print") -- omitted 6 rows ]

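For Sig6 (its row is among those truncated from the printout above), we can see a mean activity of 486 in SoftTissue-Liposarc (n=19) vs 9 in the remaining samples (n=2621).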

Let’s go further and plot the distribution for the two groups.

# Collapse the cancer types into two groups (SoftTissue-Liposarc vs. Others)
# and keep the Sig6 absolute activity for the comparison
df_check <- df_abs[, .(
  cancer_type = ifelse(
    cancer_type != "SoftTissue-Liposarc",
    "Others",
    "SoftTissue-Liposarc"
  ),
  Sig6 = Sig6
)]

# Box plot of Sig6 activity for the two groups
ggpubr::ggboxplot(
  data = df_check,
  x = "cancer_type",
  y = "Sig6",
  fill = "cancer_type",
  width = 0.3,
  xlab = FALSE
)

Check copy number distribution for the "SoftTissue-Liposarc" samples.

# Sample IDs of the SoftTissue-Liposarc tumors present in `df_abs`
samples <- df_abs[cancer_type == "SoftTissue-Liposarc", sample]

# Copy number segments of those samples, taken from the `data` slot of the
# copy number object
pcawg_cn_obj <- readRDS(file = "../data/pcawg_cn_obj.rds")
cn_dt <- subset(pcawg_cn_obj@data, subset = sample %in% samples)
# Inclusive segment length: end - start + 1
cn_dt$segLen <- with(cn_dt, end - start + 1)

Copy number value:

# Distribution of segment copy number values in the selected samples
boxplot(cn_dt[["segVal"]])

Segment length:

# Distribution of segment lengths in the selected samples
boxplot(cn_dt[["segLen"]])

# Number of segments per sample with a copy number value above 2
cn_dt_samp <- cn_dt[, list(nAMP = sum(segVal > 2)), by = "sample"]
boxplot(cn_dt_samp[["nAMP"]])